{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 用户和活动关联关系处理\n",
    "\n",
    "\n",
    "整个数据集中活动数目（events.csv）太多，所以下面的处理我们找出只在训练集和测试集中出现的活动和用户集合，并对他们重新编制索引"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "#保存数据\n",
    "import pickle as cPickle\n",
    "\n",
    "import itertools\n",
    "\n",
    "#处理事件字符串\n",
    "import datetime\n",
    "\n",
    "import numpy as np\n",
    "import scipy.io as sio\n",
    "import scipy.sparse as ss\n",
    "\n",
    "#相似度/距离\n",
    "import scipy.spatial.distance as ssd\n",
    "\n",
    "from collections import defaultdict\n",
    "from sklearn.preprocessing import normalize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_path = 'data'\n",
    "out_path = 'out'\n",
    "\n",
    "train_file_title = os.path.join(data_path, 'train.csv')\n",
    "test_file_title = os.path.join(data_path, 'test.csv')\n",
    "users_file_title = os.path.join(data_path, 'users.csv')\n",
    "events_file_title = os.path.join(data_path, 'events.csv')\n",
    "event_attendees_file_title = os.path.join(data_path, 'event_attendees.csv')\n",
    "user_friends_file_title = os.path.join(data_path, 'user_friends.csv')\n",
    "\n",
    "eventsForUser_file_title = os.path.join(out_path, 'PE_eventsForUser.pkl')\n",
    "usersForEvent_file_title = os.path.join(out_path, 'PE_usersForEvent.pkl')\n",
    "userEventScores_file_title = os.path.join(out_path, 'PE_userEventScores.pkl')\n",
    "userIndex_file_title = os.path.join(out_path, 'PE_userIndex.pkl')\n",
    "eventIndex_file_title = os.path.join(out_path, 'PE_eventIndex.pkl')\n",
    "uniqueUserPairs_file_title = os.path.join(out_path, 'FE_uniqueUserPairs.pkl')\n",
    "uniqueEventPairs_file_title = os.path.join(out_path, 'PE_uniqueEventPairs.pkl')\n",
    "\n",
    "users_useful_file_title = os.path.join(out_path, \"users_useful.csv\")\n",
    "events_useful_file_title = os.path.join(out_path, \"events_useful.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of uniqueUsers :3391\n",
      "number of uniqueEvents :13418\n",
      "interest 4131 not interest 514 unknown_count 10753\n"
     ]
    }
   ],
   "source": [
    " \"\"\"\n",
    "我们只关心train和test中出现的user和event，因此重点处理这部分关联数据\n",
    "\n",
    "train.csv 有6列：\n",
    "user：用户ID\n",
    "event：活动ID\n",
    "invited：是否被邀请（0/1）\n",
    "timestamp：ISO-8601 UTC格式时间字符串，表示用户看到该活动的时间\n",
    "interested, and not_interested\n",
    "\n",
    "Test.csv 除了没有interested, and not_interested，其余列与train相同\n",
    " \"\"\"\n",
    "    \n",
    "# 统计训练集中有多少不同的用户的events\n",
    "uniqueUsers = set()\n",
    "uniqueEvents = set()\n",
    "\n",
    "#倒排表\n",
    "#统计每个用户参加的活动   / 每个活动参加的用户\n",
    "eventsForUser = defaultdict(set)\n",
    "usersForEvent = defaultdict(set)\n",
    "    \n",
    "for filename in [train_file_title, test_file_title]:\n",
    "    f = open(filename, 'rb')\n",
    "    \n",
    "    #忽略第一行（列名字）\n",
    "    f.readline().strip().split(b\",\")\n",
    "    \n",
    "    for line in f:    #对每条记录\n",
    "        cols = line.strip().split(b\",\")\n",
    "        uniqueUsers.add(cols[0])   #第一列为用户ID\n",
    "        uniqueEvents.add(cols[1])   #第二列为活动ID\n",
    "        \n",
    "        #eventsForUser[cols[0]].add(cols[1])    #该用户参加了这个活动\n",
    "        #usersForEvent[cols[1]].add(cols[0])    #该活动被用户参加\n",
    "    f.close()\n",
    "\n",
    "\n",
    "n_uniqueUsers = len(uniqueUsers)\n",
    "n_uniqueEvents = len(uniqueEvents)\n",
    "\n",
    "print(\"number of uniqueUsers :%d\" % n_uniqueUsers)\n",
    "print(\"number of uniqueEvents :%d\" % n_uniqueEvents)\n",
    "\n",
    "#用户关系矩阵表，可用于后续LFM/SVD++处理的输入\n",
    "#这是一个稀疏矩阵，记录用户对活动感兴趣\n",
    "userEventScores = ss.dok_matrix((n_uniqueUsers, n_uniqueEvents))\n",
    "userIndex = dict()\n",
    "eventIndex = dict()\n",
    "\n",
    "#重新编码用户索引字典\n",
    "for i, u in enumerate(uniqueUsers):\n",
    "    userIndex[u] = i\n",
    "    \n",
    "#重新编码活动索引字典    \n",
    "for i, e in enumerate(uniqueEvents):\n",
    "    eventIndex[e] = i\n",
    "\n",
    "n_records = 0\n",
    "ftrain = open(train_file_title, 'rb')\n",
    "ftrain.readline()\n",
    "\n",
    "interest_count = 0\n",
    "not_interest_count = 0\n",
    "unknown_count = 0\n",
    "for line in ftrain:\n",
    "    cols = line.strip().split(b\",\")\n",
    "    i = userIndex[cols[0]]  #用户\n",
    "    j = eventIndex[cols[1]] #活动\n",
    "    \n",
    "    eventsForUser[i].add(j)    #该用户参加了这个活动\n",
    "    usersForEvent[j].add(i)    #该活动被用户参加\n",
    "        \n",
    "    #userEventScores[i, j] = int(cols[4]) - int(cols[5])   #interested - not_interested\n",
    "    if int(cols[4]) == 1:\n",
    "        assert int(cols[5]) == 0\n",
    "        interest_count += 1\n",
    "        userEventScores[i, j] = 1\n",
    "    elif int(cols[5]) == 1:\n",
    "        not_interest_count += 1\n",
    "        userEventScores[i, j] = -1\n",
    "    else:\n",
    "        unknown_count += 1\n",
    "        userEventScores[i, j] = -0.1\n",
    "        \n",
    "print('interest', interest_count, 'not interest', not_interest_count, 'unknown_count', unknown_count)\n",
    "ftrain.close()\n",
    "\n",
    "  \n",
    "##统计每个用户参加的活动，后续用于将用户朋友参加的活动影响到用户\n",
    "cPickle.dump(eventsForUser, open(eventsForUser_file_title, 'wb'))\n",
    "##统计活动参加的用户\n",
    "cPickle.dump(usersForEvent, open(usersForEvent_file_title, 'wb'))\n",
    "\n",
    "#保存用户-活动关系矩阵R，以备后用\n",
    "sio.mmwrite(userEventScores_file_title, userEventScores)\n",
    "\n",
    "\n",
    "#保存用户索引表\n",
    "cPickle.dump(userIndex, open(userIndex_file_title, 'wb'))\n",
    "#保存活动索引表\n",
    "cPickle.dump(eventIndex, open(eventIndex_file_title, 'wb'))\n",
    "\n",
    "    \n",
    "# 为了防止不必要的计算，我们找出来所有关联的用户 或者 关联的event\n",
    "# 所谓的关联用户，指的是至少在同一个event上有行为的用户pair\n",
    "# 关联的event指的是至少同一个user有行为的event pair\n",
    "uniqueUserPairs = set()\n",
    "uniqueEventPairs = set()\n",
    "for event in uniqueEvents:\n",
    "    i = eventIndex[event]\n",
    "    users = usersForEvent[i]\n",
    "    if len(users) > 2:\n",
    "        uniqueUserPairs.update(itertools.combinations(users, 2))\n",
    "        \n",
    "for user in uniqueUsers:\n",
    "    u = userIndex[user]\n",
    "    events = eventsForUser[u]\n",
    "    if len(events) > 2:\n",
    "        uniqueEventPairs.update(itertools.combinations(events, 2))\n",
    " \n",
    "#保存用户-事件关系对索引表\n",
    "cPickle.dump(uniqueUserPairs, open(uniqueUserPairs_file_title, 'wb'))\n",
    "cPickle.dump(uniqueEventPairs, open(uniqueEventPairs_file_title, 'wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(b'user_id,locale,birthyear,gender,joinedAt,location,timezone\\n',\n",
       " 38209,\n",
       " 3391,\n",
       " 0)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#训练集和测试集中出现的用户数目和事件数目远小于users.csv出现的用户数和events.csv出现的事件数\n",
    "users_total_line = 0\n",
    "users_useful_total_line = 0\n",
    "users_empty_line = 0\n",
    "with open(users_file_title, 'rb') as fSource:\n",
    "    users_file_col_name = fSource.readline()\n",
    "    with open(users_useful_file_title, 'wb') as fDest:\n",
    "        fDest.write(users_file_col_name)\n",
    "        for line in fSource:\n",
    "            if not line.strip():\n",
    "                users_empty_line = users_empty_line + 1\n",
    "                continue\n",
    "            users_total_line = users_total_line + 1\n",
    "            cols = line.split(b',')\n",
    "            if cols[0] in userIndex:\n",
    "                #convert origin id to new id\n",
    "                #cols.append(b'\\n')\n",
    "                cols[0] = str(userIndex[cols[0]]).encode()\n",
    "                fDest.write(line)\n",
    "                \n",
    "                users_useful_total_line = users_useful_total_line + 1\n",
    "                \n",
    "users_file_col_name, users_total_line, users_useful_total_line, users_empty_line"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(b'event_id,user_id,start_time,city,state,zip,country,lat,lng,c_1,c_2,c_3,c_4,c_5,c_6,c_7,c_8,c_9,c_10,c_11,c_12,c_13,c_14,c_15,c_16,c_17,c_18,c_19,c_20,c_21,c_22,c_23,c_24,c_25,c_26,c_27,c_28,c_29,c_30,c_31,c_32,c_33,c_34,c_35,c_36,c_37,c_38,c_39,c_40,c_41,c_42,c_43,c_44,c_45,c_46,c_47,c_48,c_49,c_50,c_51,c_52,c_53,c_54,c_55,c_56,c_57,c_58,c_59,c_60,c_61,c_62,c_63,c_64,c_65,c_66,c_67,c_68,c_69,c_70,c_71,c_72,c_73,c_74,c_75,c_76,c_77,c_78,c_79,c_80,c_81,c_82,c_83,c_84,c_85,c_86,c_87,c_88,c_89,c_90,c_91,c_92,c_93,c_94,c_95,c_96,c_97,c_98,c_99,c_100,c_other\\n',\n",
       " 3137972,\n",
       " 13418,\n",
       " 0)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#训练集和测试集中出现的用户数目和事件数目远小于users.csv出现的用户数和events.csv出现的事件数\n",
    "events_total_line = 0\n",
    "events_useful_total_line = 0\n",
    "events_empty_line = 0\n",
    "with open(events_file_title, 'rb') as fSource:\n",
    "    events_file_col_name = fSource.readline()\n",
    "    with open(events_useful_file_title, 'wb') as fDest:\n",
    "        fDest.write(events_file_col_name)\n",
    "        for line in fSource:\n",
    "            if not line.strip():\n",
    "                events_empty_line = events_empty_line + 1\n",
    "                continue\n",
    "            events_total_line = events_total_line + 1\n",
    "            \n",
    "            cols = line.split(b',')\n",
    "            if cols[0] in eventIndex:\n",
    "                #convert origin id to new id\n",
    "                #cols.append(b'\\n')\n",
    "                cols[0] = str(eventIndex[cols[0]]).encode()\n",
    "                line = b','.join(cols)\n",
    "                fDest.write(line)\n",
    "                \n",
    "                events_useful_total_line = events_useful_total_line + 1\n",
    "                \n",
    "events_file_col_name, events_total_line, events_useful_total_line, events_empty_line"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
